/*          
    Purpose: This file calculates 1-,5-, and 10-year average
    		 actual income around age 40 for fathers of 
    		 respondents in 2a. 1- and 3-year actual income 
    		 in 1970 is further calculated, as well as predicted 
    		 father income at the occupation x race x south level.

    Creates: Fathers_modaloccs.dta (used in do-files in 
             PSID_only/PSID_bias_bins.do)
*/

clear 
set more off

cd "$Mydirectory1/1_DataSources/PSID"

********************************
** BRING IN DATA AND RESHAPE
********************************

use ./RawData/PSID_raw_indfam.dta, clear
	drop HHlabor* h_w_inc* selfemployed* union* h_w_acc* grade* agehead* intnumber* HHunioncontract* ER* V* laborinc*

* Unique identifier (wide)
	gen father_id = famid*1000 + personnumber
	order father_id, before(famid)
	label var father_id "Father 1968 ID"

/*  Following Mazumder paper, only want certain parts of the PSID: 
    the main nationally representative sample (SRC). 
	The SEO sample (5000<famid<7000) is used in robustness checks. 
	The immigrant samples have values between 3000 and 5000, 
	and Latino families added later have values >7000. 
	
	Source: https://psidonline.isr.umich.edu/guide/faq.aspx   */
	gen src_sample = famid<3000
	gen seo_sample = famid>5000 & famid<7000
	keep if src_sample==1 | seo_sample==1 

/* IMPORTANT: Keep the fathers of adult child PSID respondents who
              gave retrospective answers (identified in 
              2a_PSID_retrospective_children.do) */
	merge 1:1 father_id using ./output/PSID_fathers_retrospective.dta
	keep if _merge==3
	drop _merge 

* Reshape to long
	#delimit ; 
	reshape long age relate race famweight indweight famweight_coreimm mainocc mainocc_retro totfaminc state maritalstatus
			 indweight_coreimm num_ sequencenumber, i(father_id sex) j(year);
	#delimit cr

**----------------------------------------------------**
**----------------------------------------------------**

***********************
* CLEAN DEMOGRAPHIC VARIABLES
***********************

* Head of household
	gen relate2 = relate
	gen hh_head = (relate2==1 & year<1983) | (relate2==10 & year>=1983)
	drop relate*

* Weight: harmonize individual weight variables
	replace indweight = indweight_coreimm if year>=1997 & year<=2015
	
* Age

	//Construct consistent age variable
	gen byr_f = year - age //Note: age = age of individual
	bysort father_id: egen byr_f2 = min(byr_f) //take minimum of all possible birth years
	drop byr_f
	label var byr_f2 "Birth year (using minimum)"
	gen age_f = year - byr_f2 - 1 
	label var age_f "Age of father"
	replace age_f=. if age_f<=0 | age_f>100

* Clean race variable  
	/*Note: Race variable used here is race 
	        of the head*/
	replace race=. if race>2 & race<8 
	replace race=. if race==0 | race==9
	replace race =. if hh_head==0 
	rename race race_2
	
	bysort father_id: egen race=mode(race_2), minmode
	drop race_2
	
* Clean marital status
	/*Note: Marital status variables are asked 
	        of only the head. */
	gen divorced = (marital==4 | marital==5) if marital<.
	replace divorced =. if hh_head==0 
	bysort father_id: egen divorced_ever = max(divorced)
	label var divorced_ever "Ever divorced"

* Currently living in the South
	/*Notes: (1) See https://psidonline.isr.umich.edu/data/Documentation/PSIDStateCodes.pdf
	             for info on state fips codes.
	         (2) The South includes Texas, Oklahoma, Arkansas, Louisiana, 
	             Mississippi, Alabama, Tennessee, Kentucky, FL, GA, SC, 
	             NC, VA, MD, DE, DC, WV (17 states).
	         (3) "State" variable is at the individual level (i.e., exclusive
	             to heads).
 */	
	tab state, m nol 
	gen south_merge = state==42 | state==35 | state==3 | state==17 | state==23 | state==1 | state==41 | state==16 | ///
					  state==9 | state==10 | state==39 | state==32 | state==45 | state==19 | state==7 | state==8 | state==47 if state<.
	tab south_merge, m
	label var south_merge "Living in South"	
	
*-----------------------------------------------------------------*
*-----------------------------------------------------------------*

*********************
*** OCCUPATIONS FOR FATHER
*********************
	
* Current occupation, harmonized across different variables
	replace mainocc=. if mainocc==0 | mainocc==999 | hh_head==0 
	replace mainocc_retro=. if mainocc_retro==0 | mainocc_retro==999 | hh_head==0 
	
	gen father_occ=mainocc if mainocc!=.
	replace father_occ=mainocc_retro if (year>=1968 & year<=1973) | (year>1974 & year<=1980)
	replace father_occ=mainocc_retro if year==1974 & mainocc==. & mainocc_retro!=.
	
	label var father_occ "Father occupation in given year"
	
* Crosswalk PSID occupations (1970 Census-based) to coarsened ANES occupations		
	clonevar census1970 = father_occ
	replace census1970=. if year>2002
	
	merge m:1 census1970 using ../Crosswalks/Crosswalk_1970Census_toANES.dta
	assert _merge!=1
	drop if _merge==2	
	drop _merge
	label var fatheroccej "Father coarsened occupation, using crosswalk"
	
/* The PSID used 1970 Census occupations until 2000. 
   (Temporarily) code father occupation in subsequent 
   years as missing. */
   replace fatheroccej=. if year>2002
		
* Crosswalk PSID occupations (2000 Census-based) to coarsened ANES occupations		
	clonevar occ2000=father_occ
	replace occ2000=. if year<2002
	
	merge m:1 occ2000 using ../Crosswalks/Crosswalk_2000Census_toANES.dta
	assert occ2000==. if _merge==1
	drop if _merge==2
	drop _merge
	
	replace fatheroccej = fatheroccej_2000 if year>=2002
	
	tab mainocc if fatheroccej==., m 
	drop fatheroccej_2000
	
* Predicted father income (i.e., "income scores")
	//occ x race x south level
	merge m:1 fatheroccej race south_merge using ../CensusData/output/IncomeScores_Coarsened_byrace_bysouth.dta
	assert fatheroccej==. | race==. | south_merge==. if _merge==1
	drop if _merge==2
	drop _merge
	
	rename avg_HHinc_byr_bys_CWfix HHincome_1940
	rename avg_HHinc_1970_byocc_byr_bys HHincome_1970 
	rename avg_HHinc_1980_byocc_byr_bys HHincome_1980 
	
	drop avg* 

*-----------------------------------------------------------------*
*-----------------------------------------------------------------*

***********************************
* MORE INCOME/OCCUPATION-RELATED
***********************************

* Fix total family income variable
	sum totfaminc, d
	replace totfaminc=0 if totfaminc<=0 
	sum totfaminc, d
	
* Convert total family income measure to 1950$
	/*Note: Income in the year prior to the 
			survey year is reported.*/
    gen year_CPI = year-1  
	
	merge m:1 year_CPI using ../CPI/CPI_deflator.dta
	drop if _merge==2
	drop _merge
	
	replace totfaminc = totfaminc * deflator
		
/* Keep fathers with at least one occupation 
   between 25 and 50 */
	gen temp2 = fatheroccej<. & (age_f>=25 & age_f<=50)
	bysort father_id: egen number_total_occs = sum(temp2)
	
	label var number_total_occs "Number of total father occs between age 25-50"
	
	drop if number_total_occs==0
	
* Count # of unique occupations per father id
	bysort father_id fatheroccej: gen temp3 = _n==1
	replace temp3=0 if fatheroccej==.
	bysort father_id: egen number_unique_occs = sum(temp3)
	drop temp*
	
	label var number_unique_occs "Number of unique father occs between age 25-50"
	
* Find modal occupation for various age ranges
	sort father_id year
	gen temp = fatheroccej if age_f>=30 & age_f<=50
	bysort father_id: egen mode_occ_30to50 = mode(temp)
	label var mode_occ_30to50 "Modal occupation between age 30 and 50"
	
	gen temp2 = fatheroccej if age_f>=20 & age_f<=40
	bysort father_id: egen mode_occ_20to40 = mode(temp2)
	label var mode_occ_20to40 "Modal occupation between age 20 and 40"
	
	gen temp3 = fatheroccej if age_f>=25 & age_f<=45
	bysort father_id: egen mode_occ_25to45 = mode(temp3)
	label var mode_occ_25to45 "Modal occupation between age 25 and 45"
	
	gen temp4 = fatheroccej if age_f>=25 & age_f<=50
	bysort father_id: egen mode_occ_25to50 = mode(temp4)
	label var mode_occ_25to50 "Modal occupation between age 25 and 50"
	
	bysort father_id: egen mode_occ_25to50_min = mode(temp4), minmode
	label var mode_occ_25to50_min "Modal occupation between age 25 and 50, minmode"
	bysort father_id: egen mode_occ_25to50_max = mode(temp4), maxmode
	label var mode_occ_25to50_max "Modal occupation between age 25 and 50, maxmode"
	
	bysort father_id: egen mode_occ_30to50_min = mode(temp), minmode
	label var mode_occ_30to50_min "Modal occupation between age 30 and 50, minmode"
	bysort father_id: egen mode_occ_30to50_max = mode(temp), maxmode
	label var mode_occ_30to50_max "Modal occupation between age 30 and 50, maxmode"

	drop temp*
	
/* Find all father occupations 
   listed between ages 25 and 50. */
	gen occ_temp = fatheroccej if age_f>=25 & age_f<=50
	
	//Obtain min and max answer
	bysort father_id: egen occ_1 = min(occ_temp)
	bysort father_id: egen occ_2 = max(occ_temp)
	replace occ_2=. if occ_1==occ_2

	//Erase those possibilities and get new min and max
	replace occ_temp=. if occ_temp==occ_1 | occ_temp==occ_2
	
	bysort father_id: egen occ_3 = min(occ_temp)
	bysort father_id: egen occ_4 = max(occ_temp)
	replace occ_4=. if occ_3==occ_4

	//Erase those possibilities and get new min and max	
	replace occ_temp=. if occ_temp==occ_3 | occ_temp==occ_4
	
	bysort father_id: egen occ_5 = min(occ_temp)
	bysort father_id: egen occ_6 = max(occ_temp)
	replace occ_6=. if occ_5==occ_6

	//Erase those possibilities and get new min and max	
	replace occ_temp=. if occ_temp==occ_5 | occ_temp==occ_6

	bysort father_id: egen occ_7 = min(occ_temp)
	bysort father_id: egen occ_8 = max(occ_temp)
	replace occ_8=. if occ_7==occ_8

	//Erase those possibilities and get new min and max	
	replace occ_temp=. if occ_temp==occ_7 | occ_temp==occ_8
	
	bysort father_id: egen occ_9 = min(occ_temp)
	bysort father_id: egen occ_10 = max(occ_temp)
	replace occ_10=. if occ_9==occ_10

	//Erase those possibilities and get new min and max	
	replace occ_temp=. if occ_temp==occ_9 | occ_temp==occ_10
	
	tab occ_temp
	drop occ_temp
	
	foreach var of varlist occ_1-occ_10 {
		rename `var' father_`var'
	}

	forval i=1(1)10 {
		label var father_occ_`i' "Father self-reported occupation b/w age 25-50, `i'"
	}
	
	
/* Grab non-modal income scores from when 
   the father is between 30 and 50 */
	foreach x in 40 70 80  {
		gen temp`x'=HHincome_19`x' if age>=30 & age<=50
	}
	
	forval i=1968(1)1985 {
		foreach x in 40 70 80 {
			gen temp = temp`x' if year==`i'
			bysort father_id: egen incomescore_`i'_`x'= max(temp)
			label var incomescore_`i'_`x' "Father income score in `i', cond. on ages 30-50, `x' based score"
			drop temp		
		}
	}
	
*-----------------------------------------------------------------*
*-----------------------------------------------------------------*

******************************
/* MAZUMDER APPROACH TO 
   FINDING ACTUAL INCOME */ 
******************************

	sort father_id year
	
	foreach var of varlist HHincome_1940 HHincome_1970 HHincome_1980 {
		replace `var'=. if totfaminc==. | totfaminc==0
	}
	
	tempfile fulldata
	save `fulldata'

	global numberyears "1 5 10" 
	foreach c in $numberyears {
		
		use `fulldata', clear
		
    	* Center around age 40
		local father_center=40
		local center `father_center'

    	* Look between 30 and 50
		local father_band=10 
		local band `father_band'

    	* # of observations that'll be tagged (1,5, and 10)
		local fcount=`c' 
		local obs_ct `fcount'

	foreach x of varlist HHincome_1940 HHincome_1970 HHincome_1980 {

        * Binary: data is okay to use (i.e., there's non-missing income at age 40)
		gen indfather_`x' = 0
		replace indfather_`x' = 1 if `x'!=. & age_f==`center'
			
        * Find total of this binary for each father
		by father_id: egen total_indf_`x' = total(indfather_`x')

        /* Now iteratively search the bands starting at the center. 
           Tag the observation in the sample if it's non-missing and 
           the respondent has not reached the the observation count (1,5, or 10). 
           The total number of observations used is then recalculated. 
           Note that the upper band is arbitrarily privileged. 
        */
        forval i = 0/`band' {
            * Upper band (41-50)
			replace indfather_`x' = 1 if `x'!=. & age_f==(`center'+`i') & (total_indf_`x'<`obs_ct')
			
            * Re-calculate the total observations used
			drop total_indf_`x'
			by father_id: egen total_indf_`x' = total(indfather_`x') 

            * Lower band (30-39)
			replace indfather_`x' = 1 if `x'!=. & age_f==(`center'-`i') & total_indf_`x'<`obs_ct'
			
            * Re-calculate the total observations used
			drop total_indf_`x'
			by father_id: egen total_indf_`x' = total(indfather_`x') 
		}

	/* Calculate mean actual income in tagged years and 
	   give that value to the father in every year */
		by father_id: egen mean_`x'_tmp = mean(`x') if indfather_`x'==1
		by father_id: egen mean_`x' = max(mean_`x'_tmp)
		label var mean_`x' "Father's avg. occ. income using `c' years and `x'"
		
	* Calculate mean *actual* total family income
		by father_id: egen temp_`x' = mean(totfaminc) if indfather_`x'==1
		by father_id: egen mean_totfaminc_`x' = max(temp_`x')

	}

	* Clean up income variables
		assert mean_totfaminc_HHincome_1940 == mean_totfaminc_HHincome_1970
		drop mean_totfaminc_HHincome_1970 mean_totfaminc_HHincome_1980
		rename mean_totfaminc_HHincome_1940 mean_totfaminc
		
	* Calculate maximum year of tagged income
		by father_id: egen max_father_year_tmp = max(year) if indfather_HHincome_1940==1
		by father_id: egen max_father_year = max(max_father_year_tmp)
		label var max_father_year "Father's max year"

	* Keep 1 observation per father
		sort father_id
		by father_id: keep if _n==1

	* Drop observations w/o enough years of income
		drop if total_indf_HHincome_1940<`obs_ct'

	* Trim
		drop *_tmp
		keep father_id mean_HHincome* mean_totfaminc* max_father_year 
		
		foreach var of varlist mean_HHincome_1940-max_father_year {
			rename `var' `var'_`c'
		}
		
	* Save tempfile
		tempfile PSIDFathers_`c'
		save `PSIDFathers_`c''
		
	}

*-----------------------------------------------------------------*
*-----------------------------------------------------------------*

****************************************
/* FIND ACTUAL INCOME AROUND
   1970 (~30 YEARS BEFORE 
   1997 WAVE WHEN ADULT CHILD 
   PSID RESPONDENTS IN 2A FIRST REPORT
   RETROSPECTIVE FATHER OCCUPATION */
****************************************

	global numberyears "1 3" 
	foreach c in $numberyears {
		
		use `fulldata', clear
		
		* Center around year 1970
		local father_center=1970
		local center `father_center'

		* Look between 1968 and 1972
		local father_band=2 
		local band `father_band'

        * # of observations that'll be tagged (changes from 1 to 3)
		local fcount=`c' 
		local obs_ct `fcount'

	foreach x of varlist HHincome_1970 {

	    * Binary: data is okay to use (i.e., there's non-missing income in 1970)
		gen indfather_`x' = 0
		replace indfather_`x' = 1 if `x'!=. & year==`center'
			
	    * Find total of this binary for each person
		by father_id: egen total_indf_`x' = total(indfather_`x')

	    /* Now iteratively search the bands starting at the center. 
	       Tag the observation in the sample if it's non-missing and 
	       the respondent has not reached the the observation count (1,or 3).
	       The total number of observations used is then recalculated. 
	       Note that the upper band is arbitrarily privileged. 
	    */  
    	forval i = 0/`band' {
			* Upper band - 1972
			replace indfather_`x' = 1 if `x'!=. & year==(`center'+`i') & (total_indf_`x'<`obs_ct')
			
            * Re-calculate the total observations used
			drop total_indf_`x'
			by father_id: egen total_indf_`x' = total(indfather_`x') 

			* Lower band - 1968
			replace indfather_`x' = 1 if `x'!=. & year==(`center'-`i') & total_indf_`x'<`obs_ct'
			
            * Re-calculate the total observations used
			drop total_indf_`x'
			by father_id: egen total_indf_`x' = total(indfather_`x') 
		}

		/* Calculate mean actual income in tagged years and 
		   give that value to the father in every year */
		by father_id: egen mean_`x'_tmp = mean(`x') if indfather_`x'==1
		by father_id: egen mean_`x' = min(mean_`x'_tmp)
		label var mean_`x' "Father's avg. occ. income using `c' years and `x'"

		}

		rename mean_HHincome_1970 mean_HHinc_around1970_`c'yr

	* Keep 1 observation per father
		sort father_id
		by father_id: keep if _n==1

	* Drop observations w/o enough years of income
		drop if total_indf_HHincome_1970<`obs_ct'

	*Trim
		drop *_tmp
		keep father_id mean_HHinc* 

	* Save tempfile
		tempfile PSIDFathers_1970_`c'
		save `PSIDFathers_1970_`c''
		
	}

*-----------------------------------------------------------------*
*-----------------------------------------------------------------*

*************************************
/* Save one file with all versions 
   of father income  */
*************************************

	use `fulldata', clear
	bysort father_id: keep if _n==1
	keep father_id mode* father_occ* number_* divorced_ever incomescore*
	
	merge 1:1 father_id using `PSIDFathers_1'
	drop _merge
	merge 1:1 father_id using `PSIDFathers_5'
	drop _merge
	merge 1:1 father_id using `PSIDFathers_10'
	drop _merge
	
	merge 1:1 father_id using `PSIDFathers_1970_1'
	drop _merge
	
	merge 1:1 father_id using `PSIDFathers_1970_3'
	drop _merge
	
	save ./output/Fathers_modaloccs.dta, replace
